# Basic setting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')
# Error detection metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
# Dummy variable analysis
from sklearn.linear_model import LinearRegression
import seaborn as sns
import statsmodels.api as sm
# STL decomposition
from statsmodels.tsa.seasonal import STL, seasonal_decompose
# ARIMA model
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.graphics.tsaplots as sgt
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima.utils import ndiffs
from pmdarima import auto_arima
import pmdarima as pmd
# NOTE: statsmodels.tsa.arima_model.ARIMA was removed in statsmodels >= 0.13;
# the maintained class lives in statsmodels.tsa.arima.model (available since 0.12).
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
# GARCH model
from statsmodels.stats.diagnostic import het_arch
from arch import arch_model
# White noise (Ljung-Box test)
from statsmodels.stats.diagnostic import acorr_ljungbox
# Rolling forecasting
from datetime import timedelta
# Path to the monthly price CSV (first column: Date 'YYYY-MM-DD', plus 'Close').
CSV_file = '/Users/lokheilee/python/fyp/ARIMA/Gilead_Sciences_monthly.csv'

def parser(s):
    """Parse an ISO 'YYYY-MM-DD' date string into a datetime object."""
    return datetime.strptime(s, '%Y-%m-%d')

# The `date_parser` keyword was deprecated in pandas 1.x and removed in 2.x.
# Reading first and converting the index afterwards works on both versions.
raw_df = pd.read_csv(CSV_file, index_col=0)
raw_df.index = pd.to_datetime(raw_df.index, format='%Y-%m-%d')

# Plot the raw monthly close price with dashed vertical year separators.
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'])
plt.title(' Monthly close price')
for year in range(2005, 2025):
    plt.axvline(datetime(year, 1, 1), color='k', linestyle='--', alpha=0.2)
# --- Dummy-variable (seasonal) regression: data preparation and fit ----------
# Re-read the CSV so this section has its own DataFrame with a Date column.
dummy_df = pd.read_csv(CSV_file)
# Convert Date to datetime
dummy_df['Date'] = pd.to_datetime(dummy_df['Date'])
# Create a linear time-trend counter (1, 2, 3, ...) after sorting by date.
dummy_df = dummy_df.sort_values('Date')
dummy_df['Time_Period'] = range(1, len(dummy_df) + 1)
# Ensure 'Close' is numeric (non-numeric entries become NaN).
dummy_df['Close'] = pd.to_numeric(dummy_df['Close'], errors='coerce')
# Remove any rows with NaN values if they exist.
dummy_df = dummy_df.dropna()
# Create monthly dummy variables (Month_1 ... Month_12).
dummy_df['Month'] = dummy_df['Date'].dt.month
dummy_months = pd.get_dummies(dummy_df['Month'], prefix='Month')
# Combine the time trend with the dummies; the LAST month column is dropped
# to avoid the dummy-variable trap (perfect collinearity with the constant).
X = pd.concat([dummy_df[['Time_Period']], dummy_months.iloc[:, :-1]], axis=1)
y = dummy_df['Close']
# Convert to numpy arrays and force float dtype (get_dummies may yield bool).
X = np.asarray(X).astype(float)
y = np.asarray(y).astype(float)
# Add the intercept column for OLS.
X = sm.add_constant(X)
# Fit OLS: Close ~ const + Time_Period + 11 month dummies.
model = sm.OLS(y, X).fit()
# In-sample predictions from the fitted model.
dva_predictions = model.predict(X)
# Residuals and standard error metrics of the fit.
dva_residuals = y - dva_predictions
mse = np.mean(dva_residuals**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(dva_residuals))
mape = mean_absolute_percentage_error(y, dva_predictions) * 100
# --- Report the dummy-variable regression ------------------------------------
print("\nRegression Summary:")
print(model.summary())
# Actual vs fitted close price over time.
plt.figure(figsize=(12, 6))
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'], dva_predictions, label='Estimated', color='red', linestyle='--')
plt.title('Actual v Dummy Variable Estimated')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Print MAPE
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
# Print the fitted equation term by term.  X was passed to sm.OLS as a numpy
# array, so model.params is a plain ndarray and integer indexing is safe.
# Parameter order: [const, Time_Period, Month_1 ... Month_11].
print("\nRegression Equation:")
print("Close = ", end="")
print(f"{model.params[0]:.4f}", end="")
print(f" + {model.params[1]:.4f}*Time_Period", end="")
for i in range(2, len(model.params)):
    print(f" + {model.params[i]:.4f}*Month_{i-1}", end="")
print()
# --- Residual analysis for the dummy-variable model --------------------------
# Residuals indexed by date.
res = pd.DataFrame({
    'date': dummy_df['Date'],
    'resid': dva_residuals
})
res.set_index('date', inplace=True)
# Band statistics as plain local variables.  Assigning new attributes on a
# DataFrame (res.resid_mean = ...) does NOT create a column: pandas warns and
# silently drops such attributes on copy, so it is fragile.
resid_mean = res.resid.mean()
resid_std = res.resid.std()
lower_bound = resid_mean - 3 * resid_std
upper_bound = resid_mean + 3 * resid_std
# Plot residuals with the +/-3-sigma band.
plt.figure(figsize=(12, 8))
plt.plot(res.index, res.resid)
plt.fill_between(res.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Dummy Variable Residuals Analysis')
plt.xlabel('Date')
plt.ylabel('Residuals')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Anomaly detection: residuals outside the +/-3-sigma band, marked on the
# price series with dashed red lines.
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(dummy_df['Date'], dummy_df['Close'])
for date in anomalies.index:
    plt.axvline(date, color='r', linestyle='--', alpha=0.5)
plt.title(' Dummy Variable Time Series with Anomalies')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
print('\nDates of Anomalies:')
print(anomalies.index.strftime('%Y-%m-%d').tolist())
# --- White-noise and ARCH tests on the regression residuals ------------------
# Ljung-Box test at lag 12 (monthly data, one seasonal cycle).
white_noise_dva = acorr_ljungbox(dva_residuals, lags = [12], return_df=True)
white_noise_dva
# Print the p-value from the Ljung-Box test (the row label is the lag, 12).
print("\n")
print(f"Ljung-Box test p-value: {white_noise_dva['lb_pvalue'][12]}")
# Interpret the result
p_value = white_noise_dva['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test for ARCH effects (conditional heteroskedasticity).
lm_stat, lm_pval, f_stat, f_pval = het_arch(dva_residuals)
# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.752
Model: OLS Adj. R-squared: 0.739
Method: Least Squares F-statistic: 57.51
Date: Wed, 26 Mar 2025 Prob (F-statistic): 6.53e-62
Time: 23:56:39 Log-Likelihood: -940.12
No. Observations: 240 AIC: 1906.
Df Residuals: 227 BIC: 1951.
Df Model: 12
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 5.5181 3.159 1.747 0.082 -0.706 11.743
x1 0.3056 0.012 26.195 0.000 0.283 0.329
x2 0.2944 3.956 0.074 0.941 -7.501 8.090
x3 -0.9605 3.956 -0.243 0.808 -8.756 6.835
x4 -0.9965 3.956 -0.252 0.801 -8.791 6.798
x5 -1.0694 3.955 -0.270 0.787 -8.863 6.724
x6 -1.3762 3.955 -0.348 0.728 -9.170 6.417
x7 -0.9658 3.955 -0.244 0.807 -8.759 6.827
x8 -0.0614 3.955 -0.016 0.988 -7.854 7.731
x9 0.0794 3.955 0.020 0.984 -7.713 7.872
x10 -0.3503 3.954 -0.089 0.929 -8.142 7.442
x11 0.5925 3.954 0.150 0.881 -7.199 8.384
x12 1.1133 3.954 0.282 0.779 -6.678 8.905
==============================================================================
Omnibus: 50.925 Durbin-Watson: 0.074
Prob(Omnibus): 0.000 Jarque-Bera (JB): 79.640
Skew: 1.215 Prob(JB): 5.09e-18
Kurtosis: 4.434 Cond. No. 1.73e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.73e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Mean Absolute Percentage Error (MAPE): 26.72% Regression Equation: Close = 5.5181 + 0.3056*Time_Period + 0.2944*Month_1 + -0.9605*Month_2 + -0.9965*Month_3 + -1.0694*Month_4 + -1.3762*Month_5 + -0.9658*Month_6 + -0.0614*Month_7 + 0.0794*Month_8 + -0.3503*Month_9 + 0.5925*Month_10 + 1.1133*Month_11
Dates of Anomalies: ['2014-10-01', '2015-05-01', '2015-06-01', '2015-07-01'] Ljung-Box test p-value: 0.0 Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests the residuals are not independent (not white noise). === ARCH LM Test Results === LM test statistic: 203.0241 LM test p-value: 3.775819944887155e-38 ARCH LM Test Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests there is ARCH effect present in the residuals.
# --- Additive seasonal decomposition -----------------------------------------
# Re-import just Date/Close for the decomposition section.
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl = df_stl.dropna()
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)
# Additive model: Close = trend + seasonal + residual, 12-month period.
seasonal_decomp_add = seasonal_decompose(df_stl, model='additive', period=12)
# Store the components as columns.  Trend/residual have NaN at both edges
# because the trend is a centred moving average.
df_stl['trend_add'] = seasonal_decomp_add.trend
df_stl['seasonal_add'] = seasonal_decomp_add.seasonal
df_stl['residual_add'] = seasonal_decomp_add.resid
# Plot the original series and the three components in a 4-row figure.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 8))
# Plot original data
df_stl['Close'].plot(ax=ax1)
ax1.set_title('Original Time Series')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price')
# Plot trend
seasonal_decomp_add.trend.plot(ax=ax2)
ax2.set_title('Trend')
ax2.set_xlabel('Date')
ax2.set_ylabel('Trend')
# Plot seasonal
seasonal_decomp_add.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
ax3.set_xlabel('Date')
ax3.set_ylabel('Seasonal')
# Plot residual
seasonal_decomp_add.resid.plot(ax=ax4)
ax4.set_title('Residual')
ax4.set_xlabel('Date')
ax4.set_ylabel('Residual')
# Add main title
plt.suptitle('Additive Decomposition of Time Series', fontsize=16, y=1.02)
# Adjust layout
plt.tight_layout()
plt.show()
# --- Diagnostic plots for the additive-decomposition residuals ---------------
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
# 1. Residual Plot over time
ax1.plot(df_stl.index, df_stl['residual_add'])
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set_title('Residuals Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residual')
# 2. Residual Histogram with a kernel density estimate.
sns.histplot(df_stl['residual_add'].dropna(), kde=True, ax=ax2)
ax2.set_title('Residual Distribution')
ax2.set_xlabel('Residual')
ax2.set_ylabel('Frequency')
# 3. Q-Q Plot against the normal distribution.
from scipy import stats
stats.probplot(df_stl['residual_add'].dropna(), dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot')
# 4. Residual vs Fitted — the trend component stands in for fitted values.
ax4.scatter(df_stl['trend_add'], df_stl['residual_add'])
ax4.axhline(y=0, color='r', linestyle='--')
ax4.set_title('Residual vs Fitted')
ax4.set_xlabel('Fitted Values (Trend)')
ax4.set_ylabel('Residuals')
plt.suptitle('Diagnostic Plots for Additive Decomposition', fontsize=16)
plt.tight_layout()
plt.show()
# --- Residual analysis for the additive decomposition ------------------------
res = pd.DataFrame(seasonal_decomp_add.resid)
res.columns = ['resid']
# Band statistics as plain locals.  Attribute assignment on a DataFrame
# (res.resid_mean = ...) is not a column; pandas warns and drops it on copy.
resid_mean = res.resid.mean()
resid_std = res.resid.std()
lower_bound = resid_mean - 3 * resid_std
upper_bound = resid_mean + 3 * resid_std
# Plot the residuals with the +/-3-sigma band.
plt.figure(figsize=(12, 8))
plt.plot(res.resid)
plt.fill_between(res.resid.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Additive Residuals Analysis')
plt.xlim(res.resid.index[0], res.resid.index[-1])
plt.show()
# Anomaly detection: residuals outside the band, marked on the price series.
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('Additive STL Time Series with Anomalies')
plt.show()
print(f'Dates of Anomalies: {anomalies.index}')
# --- Reconstruction (in-sample estimate) from the additive components --------
# Additive model: estimate = trend + seasonal.
estimated_add = df_stl['trend_add'] + df_stl['seasonal_add']
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'], label='Original')
plt.plot(estimated_add, label='Estimated_Add')
plt.title('Original vs Estimated(Additive STL)')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()
# MAPE and RMSE of the reconstruction (the NaN edges from the centred
# moving-average trend are skipped by pandas' NaN-aware arithmetic/mean).
mape = np.mean(np.abs((raw_df['Close'] - estimated_add) / raw_df['Close'])) * 100
rmse = np.mean((raw_df['Close'] - estimated_add) ** 2) ** 0.5
print(f'Mean Absolute Percentage Error: {mape}')
print(f'Root Mean Square Error: {rmse}')
# --- White-noise and ARCH tests on the additive residuals --------------------
# Ljung-Box test at lag 12 on the (NaN-free) decomposition residuals.
white_noise_add = acorr_ljungbox(df_stl['residual_add'].dropna(), lags = [12], return_df=True)
white_noise_add
# Print the p-value from the Ljung-Box test (row label = lag 12).
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")
# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test for ARCH effects in the residuals.
lm_stat, lm_pval, f_stat, f_pval = het_arch(df_stl['residual_add'].dropna())
# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Dates of Anomalies: DatetimeIndex(['2020-04-01', '2022-11-01', '2024-05-01'], dtype='datetime64[ns]', name='Date', freq=None)
Mean Absolute Percentage Error: 5.947847323694597 Root Mean Square Error: 3.382089684320981 Ljung-Box test p-value: 1.0220833073493274e-27 Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests the residuals are not independent (not white noise). === ARCH LM Test Results === LM test statistic: 55.7577 LM test p-value: 2.2790094839537187e-08 ARCH LM Test Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests there is ARCH effect present in the residuals.
# --- GARCH model on the additive-decomposition residuals ---------------------
# ACF/PACF of the residuals to motivate the volatility-model order.
# NOTE: ACF is deliberately drawn on ax2 (right) and PACF on ax1 (left) so
# the AR-term panel appears first.
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(df_stl['residual_add'].dropna(), lags=40, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(df_stl['residual_add'].dropna(), lags=40, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()
# Fit a constant-mean GARCH(1, 3) volatility model to the residuals.
mdl_garch = arch_model(df_stl['residual_add'].dropna(), vol = 'GARCH', p = 1, q = 3)
garch_fit = mdl_garch.fit()
print(garch_fit.summary())
# Standardised residuals: raw residual divided by conditional volatility.
garch_std_resid = pd.Series(garch_fit.resid / garch_fit.conditional_volatility)
fig = plt.figure(figsize = (15, 8))
# Standardised-residual trace (top row spans both columns).
garch_std_resid.plot(ax = fig.add_subplot(3,1,1), title = 'GARCH Standardized-Residual', legend = False)
# ACF/PACF of the standardised residuals.
sgt.plot_acf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,3))
sgt.plot_pacf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,4))
# QQ-plot and histogram to check approximate normality.
sm.qqplot(garch_std_resid, line='s', ax=fig.add_subplot(3,2,5))
plt.title("QQ Plot")
fig.add_subplot(3,2,6).hist(garch_std_resid, bins = 12)
plt.title("Histogram")
plt.tight_layout()
plt.show()
# --- White-noise and ARCH tests on the GARCH standardised residuals ----------
# NOTE: this reuses the white_noise_add variable name from the section above.
white_noise_add = acorr_ljungbox(garch_std_resid, lags = [12], return_df=True)
white_noise_add
# Print the p-value from the Ljung-Box test (row label = lag 12).
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")
# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test: after a good GARCH fit, no ARCH effect should remain.
lm_stat, lm_pval, f_stat, f_pval = het_arch(garch_std_resid)
# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Iteration: 1, Func. Count: 8, Neg. LLF: 352282.5623818712
Iteration: 2, Func. Count: 16, Neg. LLF: 2099.191642868249
Iteration: 3, Func. Count: 24, Neg. LLF: 590.1877487688679
Iteration: 4, Func. Count: 33, Neg. LLF: 534.736507352731
Iteration: 5, Func. Count: 41, Neg. LLF: 534.1803505112179
Iteration: 6, Func. Count: 49, Neg. LLF: 534.8150314818827
Iteration: 7, Func. Count: 57, Neg. LLF: 533.8966257897875
Iteration: 8, Func. Count: 64, Neg. LLF: 533.8765961682659
Iteration: 9, Func. Count: 71, Neg. LLF: 533.8765508268564
Iteration: 10, Func. Count: 78, Neg. LLF: 533.8765482164501
Iteration: 11, Func. Count: 84, Neg. LLF: 533.8765478170626
Optimization terminated successfully (Exit mode 0)
Current function value: 533.8765482164501
Iterations: 11
Function evaluations: 84
Gradient evaluations: 11
Constant Mean - GARCH Model Results
==============================================================================
Dep. Variable: residual_add R-squared: 0.000
Mean Model: Constant Mean Adj. R-squared: 0.000
Vol Model: GARCH Log-Likelihood: -533.877
Distribution: Normal AIC: 1079.75
Method: Maximum Likelihood BIC: 1100.33
No. Observations: 228
Date: Wed, Mar 26 2025 Df Residuals: 227
Time: 23:56:41 Df Model: 1
Mean Model
==========================================================================
coef std err t P>|t| 95.0% Conf. Int.
--------------------------------------------------------------------------
mu -0.1516 0.126 -1.201 0.230 [ -0.399,9.578e-02]
Volatility Model
===========================================================================
coef std err t P>|t| 95.0% Conf. Int.
---------------------------------------------------------------------------
omega 0.2276 0.159 1.428 0.153 [-8.483e-02, 0.540]
alpha[1] 0.4088 9.738e-02 4.198 2.695e-05 [ 0.218, 0.600]
beta[1] 0.1408 0.173 0.815 0.415 [ -0.198, 0.479]
beta[2] 0.1771 0.376 0.470 0.638 [ -0.561, 0.915]
beta[3] 0.2733 0.417 0.655 0.513 [ -0.545, 1.091]
===========================================================================
Covariance estimator: robust
Ljung-Box test p-value: 7.081596215629725e-17 Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests the residuals are not independent (not white noise). === ARCH LM Test Results === LM test statistic: 5.4971 LM test p-value: 0.855599747400229 ARCH LM Test Interpretation: Since p-value > 0.05, we fail to reject the null hypothesis. This suggests there is no ARCH effect in the residuals.
# --- Multiplicative seasonal decomposition -----------------------------------
# (Redundant re-import; these names are already imported at the top of file.)
from statsmodels.tsa.seasonal import STL, seasonal_decompose
# Multiplicative model: Close = trend * seasonal * residual; the seasonal and
# residual components are unitless factors centred around 1.0.
seasonal_decomp_multi = seasonal_decompose(df_stl['Close'], model='multiplicative', period=12)
df_stl['trend_multi'] = seasonal_decomp_multi.trend
df_stl['seasonal_multi'] = seasonal_decomp_multi.seasonal
df_stl['residual_multi'] = seasonal_decomp_multi.resid
# Plot the original series and the three components in a 4-row figure.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 8))
# Plot original data
df_stl['Close'].plot(ax=ax1)
ax1.set_title('Original Time Series')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price')
# Plot trend
seasonal_decomp_multi.trend.plot(ax=ax2)
ax2.set_title('Trend')
ax2.set_xlabel('Date')
ax2.set_ylabel('Trend')
# Plot seasonal
seasonal_decomp_multi.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
ax3.set_xlabel('Date')
ax3.set_ylabel('Seasonal')
# Plot residual
seasonal_decomp_multi.resid.plot(ax=ax4)
ax4.set_title('Residual')
ax4.set_xlabel('Date')
ax4.set_ylabel('Residual')
# Add main title
plt.suptitle('Multipliacative Decomposition of Time Series', fontsize=16, y=1.02)
# Adjust layout
plt.tight_layout()
plt.show()
# --- Diagnostic plots for the multiplicative-decomposition residuals ---------
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
# 1. Residual Plot over time (multiplicative residuals hover around 1, not 0).
ax1.plot(df_stl.index, df_stl['residual_multi'])
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set_title('Residuals Over Time')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residual')
# 2. Residual Histogram with kernel density estimate.
sns.histplot(df_stl['residual_multi'].dropna(), kde=True, ax=ax2)
ax2.set_title('Residual Distribution')
ax2.set_xlabel('Residual')
ax2.set_ylabel('Frequency')
# 3. Q-Q Plot against the normal distribution.
from scipy import stats
stats.probplot(df_stl['residual_multi'].dropna(), dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot')
# 4. Residual vs Fitted — the trend component stands in for fitted values.
ax4.scatter(df_stl['trend_multi'], df_stl['residual_multi'])
ax4.axhline(y=0, color='r', linestyle='--')
ax4.set_title('Residual vs Fitted')
ax4.set_xlabel('Fitted Values (Trend)')
ax4.set_ylabel('Residuals')
plt.suptitle('Diagnostic Plots for Multipliacative Decomposition', fontsize=16)
plt.tight_layout()
plt.show()
# --- Residual analysis for the multiplicative decomposition ------------------
res = pd.DataFrame(seasonal_decomp_multi.resid)
res.columns = ['resid']
# Band statistics as plain locals.  Attribute assignment on a DataFrame
# (res.resid_mean = ...) is not a column; pandas warns and drops it on copy.
resid_mean = res.resid.mean()
resid_std = res.resid.std()
lower_bound = resid_mean - 3 * resid_std
upper_bound = resid_mean + 3 * resid_std
# Plot the residuals (factors near 1.0) with the +/-3-sigma band.
plt.figure(figsize=(12, 8))
plt.plot(res.resid)
plt.fill_between(res.resid.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('Multipliacative Residuals Analysis')
plt.xlim(res.resid.index[0], res.resid.index[-1])
plt.show()
# Anomaly detection: residuals outside the band, marked on the price series.
anomalies = res.resid[(res.resid < lower_bound) | (res.resid > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('Multipliacative STL Time Series with Anomalies')
plt.show()
print(f'Dates of Anomalies: {anomalies.index}')
# --- Reconstruction (in-sample estimate) from the multiplicative components --
# For a multiplicative model Close = trend * seasonal * residual, so the
# estimate is the PRODUCT of trend and seasonal.  The original ADDED them,
# which mixes a price level with a unitless seasonal factor near 1.0 and
# systematically distorts the estimate.
estimated_multi = df_stl['trend_multi'] * df_stl['seasonal_multi']
plt.figure(figsize=(16, 9))
plt.plot(raw_df['Close'], label='Original')
plt.plot(estimated_multi, label='Estimated_Multi')
plt.title('Original vs Estimated(Multiple STL)')
plt.legend()
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()
# MAPE and RMSE of the reconstruction (NaN edges from the centred trend are
# skipped by pandas' NaN-aware arithmetic/mean).
mape = np.mean(np.abs((raw_df['Close'] - estimated_multi) / raw_df['Close'])) * 100
rmse = np.mean((raw_df['Close'] - estimated_multi) ** 2) ** 0.5
print(f'Mean Absolute Percentage Error: {mape}')
print(f'Root Mean Square Error: {rmse}')
# --- White-noise and ARCH tests on the multiplicative residuals --------------
# Ljung-Box test at lag 12 on the (NaN-free) decomposition residuals.
white_noise_multi = acorr_ljungbox(df_stl['residual_multi'].dropna(), lags = [12], return_df=True)
white_noise_multi
# Print the p-value from the Ljung-Box test (row label = lag 12).
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {white_noise_multi['lb_pvalue'][12]}")
# Interpret the result
p_value = white_noise_multi['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test for ARCH effects in the residuals.
lm_stat, lm_pval, f_stat, f_pval = het_arch(df_stl['residual_multi'].dropna())
# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Dates of Anomalies: DatetimeIndex(['2020-04-01'], dtype='datetime64[ns]', name='Date', freq=None)
Mean Absolute Percentage Error: 7.114270037834504 Root Mean Square Error: 3.603418909131697 === Ljung-Box test Results === Ljung-Box test p-value: 3.5063073205305405e-21 Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests the residuals are not independent (not white noise). === ARCH LM Test Results === LM test statistic: 85.8593 LM test p-value: 3.5326063367906394e-14 ARCH LM Test Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests there is ARCH effect present in the residuals.
# --- GARCH model on the multiplicative-decomposition residuals ---------------
# ACF/PACF of the residuals (ACF on the right axes, PACF on the left).
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(df_stl['residual_multi'].dropna(), lags=40, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(df_stl['residual_multi'].dropna(), lags=40, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()
# Fit a constant-mean GARCH(1, 3) volatility model to the residuals.
mdl_garch = arch_model(df_stl['residual_multi'].dropna(), vol = 'GARCH', p = 1, q = 3)
garch_fit = mdl_garch.fit()
print(garch_fit.summary())
# Standardised residuals: raw residual divided by conditional volatility.
garch_std_resid = pd.Series(garch_fit.resid / garch_fit.conditional_volatility)
fig = plt.figure(figsize = (15, 8))
# Standardised-residual trace (top row spans both columns).
garch_std_resid.plot(ax = fig.add_subplot(3,1,1), title = 'GARCH Standardized-Residual', legend = False)
# ACF/PACF of the standardised residuals.
# NOTE: subplot positions are swapped relative to the additive section
# (ACF in slot 4, PACF in slot 3) — presumably unintentional; confirm.
sgt.plot_acf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,4))
sgt.plot_pacf(garch_std_resid, zero = False, lags = 40, ax=fig.add_subplot(3,2,3))
# QQ-plot and histogram to check approximate normality.
sm.qqplot(garch_std_resid, line='s', ax=fig.add_subplot(3,2,5))
plt.title("QQ Plot")
fig.add_subplot(3,2,6).hist(garch_std_resid, bins = 12)
plt.title("Histogram")
plt.tight_layout()
plt.show()
# --- White-noise and ARCH tests on the GARCH standardised residuals ----------
# NOTE: reuses the white_noise_add variable name from earlier sections.
white_noise_add = acorr_ljungbox(garch_std_resid, lags = [12], return_df=True)
white_noise_add
# Print the p-value from the Ljung-Box test (row label = lag 12).
print("\n")
print(f"Ljung-Box test p-value: {white_noise_add['lb_pvalue'][12]}")
# Interpret the result
p_value = white_noise_add['lb_pvalue'][12]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test: after a good GARCH fit, no ARCH effect should remain.
lm_stat, lm_pval, f_stat, f_pval = het_arch(garch_std_resid)
# Print and interpret LM test
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# Interpret LM test result
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Iteration: 1, Func. Count: 8, Neg. LLF: 2049087.0251516781
Iteration: 2, Func. Count: 20, Neg. LLF: 2692.340695813669
Iteration: 3, Func. Count: 28, Neg. LLF: -279.4323496771234
Iteration: 4, Func. Count: 37, Neg. LLF: -175.8420853791262
Iteration: 5, Func. Count: 45, Neg. LLF: -301.3392313723543
Iteration: 6, Func. Count: 52, Neg. LLF: -301.27062791660154
Iteration: 7, Func. Count: 60, Neg. LLF: -300.9407438011037
Iteration: 8, Func. Count: 68, Neg. LLF: -301.4506053907312
Iteration: 9, Func. Count: 76, Neg. LLF: -301.48019145227585
Iteration: 10, Func. Count: 83, Neg. LLF: -301.48022617358356
Iteration: 11, Func. Count: 90, Neg. LLF: -301.48022777839446
Iteration: 12, Func. Count: 96, Neg. LLF: -301.4802277677842
Optimization terminated successfully (Exit mode 0)
Current function value: -301.48022777839446
Iterations: 12
Function evaluations: 96
Gradient evaluations: 12
Constant Mean - GARCH Model Results
==============================================================================
Dep. Variable: residual_multi R-squared: 0.000
Mean Model: Constant Mean Adj. R-squared: 0.000
Vol Model: GARCH Log-Likelihood: 301.480
Distribution: Normal AIC: -590.960
Method: Maximum Likelihood BIC: -570.384
No. Observations: 228
Date: Wed, Mar 26 2025 Df Residuals: 227
Time: 23:56:44 Df Model: 1
Mean Model
========================================================================
coef std err t P>|t| 95.0% Conf. Int.
------------------------------------------------------------------------
mu 0.9927 7.905e-03 125.574 0.000 [ 0.977, 1.008]
Volatility Model
=============================================================================
coef std err t P>|t| 95.0% Conf. Int.
-----------------------------------------------------------------------------
omega 2.2933e-03 1.628e-03 1.408 0.159 [-8.982e-04,5.485e-03]
alpha[1] 0.3674 0.151 2.436 1.487e-02 [7.174e-02, 0.663]
beta[1] 2.2990e-16 0.223 1.029e-15 1.000 [ -0.438, 0.438]
beta[2] 0.1390 0.766 0.182 0.856 [ -1.362, 1.640]
beta[3] 1.6009e-16 0.414 3.870e-16 1.000 [ -0.811, 0.811]
=============================================================================
Covariance estimator: robust
Ljung-Box test p-value: 2.677826057842571e-15 Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests the residuals are not independent (not white noise). === ARCH LM Test Results === LM test statistic: 2.5351 LM test p-value: 0.9903546162860751 ARCH LM Test Interpretation: Since p-value > 0.05, we fail to reject the null hypothesis. This suggests there is no ARCH effect in the residuals.
# Bare expression: renders the fit summary in a notebook cell; in a plain
# script it has no visible effect (use print(garch_fit.summary()) instead).
garch_fit.summary()
| Dep. Variable: | residual_multi | R-squared: | 0.000 |
|---|---|---|---|
| Mean Model: | Constant Mean | Adj. R-squared: | 0.000 |
| Vol Model: | GARCH | Log-Likelihood: | 301.480 |
| Distribution: | Normal | AIC: | -590.960 |
| Method: | Maximum Likelihood | BIC: | -570.384 |
| No. Observations: | 228 | ||
| Date: | Wed, Mar 26 2025 | Df Residuals: | 227 |
| Time: | 23:56:44 | Df Model: | 1 |
| coef | std err | t | P>|t| | 95.0% Conf. Int. | |
|---|---|---|---|---|---|
| mu | 0.9927 | 7.905e-03 | 125.574 | 0.000 | [ 0.977, 1.008] |
| coef | std err | t | P>|t| | 95.0% Conf. Int. | |
|---|---|---|---|---|---|
| omega | 2.2933e-03 | 1.628e-03 | 1.408 | 0.159 | [-8.982e-04,5.485e-03] |
| alpha[1] | 0.3674 | 0.151 | 2.436 | 1.487e-02 | [7.174e-02, 0.663] |
| beta[1] | 2.2990e-16 | 0.223 | 1.029e-15 | 1.000 | [ -0.438, 0.438] |
| beta[2] | 0.1390 | 0.766 | 0.182 | 0.856 | [ -1.362, 1.640] |
| beta[3] | 1.6009e-16 | 0.414 | 3.870e-16 | 1.000 | [ -0.811, 0.811] |
#ARIMA Model
#Import data
arima_df = raw_df['Close']
#infer the frequency of the data
df_freq = arima_df.asfreq(pd.infer_freq(raw_df.index))
#ADF Test (for stationarity)
result = adfuller(arima_df.dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
if result[1] <= 0.05:
print('Data is stationary')
diff_order = 0
else:
print('Data is non-stationary')
diff_order = 1
diff = arima_df.diff()[1:]
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
ax1.plot(arima_df)
ax1.set_title('Monthly Close Price')
ax2.plot(diff)
ax2.set_title('Monthly Close Price (Difference Once)')
plt.show()
#ACF and PACF plots for the data
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
if diff_order == 0:
setting = arima_df
else:
setting = diff
plot_acf(setting, lags=20, zero=False, ax=ax2, title='Autocorrelation (For MA term)')
plot_pacf(setting, lags=20, zero=False, ax=ax1, title = 'Partial Autocorrelation (For AR term)')
plt.show()
#Arima Model
# Fit an ARIMA model with auto_arima (searches p and q), holding the
# differencing order d at the value decided by the ADF test above.
pmd_mdl = pmd.auto_arima(y=arima_df, d=diff_order)
print(pmd_mdl.summary())
# In-sample (fitted) values, aligned with the original date index.
fitted_values = pd.Series(pmd_mdl.predict_in_sample(), index=arima_df.index)
# Residuals = actual - fitted
residuals = arima_df - fitted_values
# Comparison table kept for inspection.
comparison = pd.DataFrame({
    'Actual': arima_df,
    'Estimated': fitted_values,
    'Residuals': residuals
})
# Plotting
# BUG FIX: the original created a single axis (`fig, (ax1) = plt.subplots(1, 1, ...)`)
# but then drew the residual panel on an undefined `ax2`, raising NameError.
# Create two stacked axes so both panels render.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
# Plot 1: Actual vs Estimated
ax1.plot(arima_df.index, arima_df, label='Actual', color='blue')
ax1.plot(arima_df.index, fitted_values, label='Estimated', color='red', linestyle='--')
ax1.set_title('Actual vs Estimated Values ARIMA Model')
ax1.legend()
ax1.grid(True)
# Plot 2: Residuals
ax2.plot(arima_df.index, residuals, color='green')
ax2.axhline(y=0, color='r', linestyle='--')
ax2.set_title('Residuals')
ax2.grid(True)
plt.tight_layout()
plt.show()
# In-sample performance metrics.
mse = ((residuals) ** 2).mean()
rmse = np.sqrt(mse)
mae = abs(residuals).mean()
mape = (abs(residuals) / arima_df * 100).mean()
print("\nPerformance Metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.4f}%")
# Standard residual diagnostics (standardized residuals, histogram, Q-Q, ACF).
pmd_mdl.plot_diagnostics(figsize=(15, 10))
plt.show()
# --- 3-sigma anomaly detection on the ARIMA residuals ---
# BUG FIX: the original assigned to attributes of an undefined object
# (`res.resid_mean = ...` raises NameError since `res` is never defined).
# Plain local variables are all that is needed here.
resid_mean = residuals.mean()
resid_std = residuals.std()
lower_bound = resid_mean - 3 * resid_std
upper_bound = resid_mean + 3 * resid_std
# Plot the residuals with the +/- 3 sigma band.
plt.figure(figsize=(12, 8))
plt.plot(residuals)
plt.fill_between(residuals.index, lower_bound, upper_bound, color='green', alpha=0.3)
plt.title('ARIMA Residuals Analysis')
plt.xlim(residuals.index[0], residuals.index[-1])
plt.show()
# Anomalies = residuals falling outside the 3-sigma band.
anomalies = residuals[(residuals < lower_bound) | (residuals > upper_bound)]
plt.figure(figsize=(12, 8))
plt.plot(raw_df['Close'])
for i in anomalies.index:
    plt.axvline(i, color='r', linestyle='--', alpha=0.5)
plt.title('ARIMA Time Series with Anomalies')
plt.show()
print(f'Dates of Anomalies: {anomalies.index}')
# White-noise check on the ARIMA residuals: Ljung-Box at lag 12.
white_noise_arima = acorr_ljungbox(residuals, lags=[12], return_df=True)
white_noise_arima
p_value = white_noise_arima['lb_pvalue'][12]
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {p_value}")
# H0: residuals are independently distributed (white noise).
if p_value <= 0.05:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
# Engle's LM test for conditional heteroskedasticity (ARCH effects).
lm_stat, lm_pval, f_stat, f_pval = het_arch(residuals)
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# H0: no ARCH effect in the residuals.
if lm_pval <= 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
ADF Statistic: -0.6472245123232313 p-value: 0.859950295455792 Data is non-stationary
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 240
Model: SARIMAX(2, 1, 2) Log Likelihood -625.968
Date: Wed, 26 Mar 2025 AIC 1263.935
Time: 23:56:46 BIC 1284.794
Sample: 01-01-2005 HQIC 1272.341
- 12-01-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 0.6014 0.392 1.532 0.125 -0.168 1.371
ar.L1 0.2478 0.041 5.985 0.000 0.167 0.329
ar.L2 -0.9429 0.033 -28.417 0.000 -1.008 -0.878
ma.L1 -0.1825 0.040 -4.514 0.000 -0.262 -0.103
ma.L2 0.9445 0.037 25.434 0.000 0.872 1.017
sigma2 11.0018 0.807 13.636 0.000 9.420 12.583
===================================================================================
Ljung-Box (L1) (Q): 0.57 Jarque-Bera (JB): 54.48
Prob(Q): 0.45 Prob(JB): 0.00
Heteroskedasticity (H): 18.10 Skew: 0.42
Prob(H) (two-sided): 0.00 Kurtosis: 5.18
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Performance Metrics: MSE: 11.0818 RMSE: 3.3289 MAE: 2.3417 MAPE: 5.9224%
Dates of Anomalies: DatetimeIndex(['2014-08-01', '2016-01-01', '2022-10-01'], dtype='datetime64[ns]', name='Date', freq=None) === Ljung-Box test Results === Ljung-Box test p-value: 0.3998817785935515 Interpretation: Since p-value > 0.05, we fail to reject the null hypothesis. This suggests the residuals are independent (white noise). === ARCH LM Test Results === LM test statistic: 25.6393 LM test p-value: 0.004256812590031472 ARCH LM Test Interpretation: Since p-value < 0.05, we reject the null hypothesis. This suggests there is ARCH effect present in the residuals.
# --- Dummy-variable regression: linear trend + monthly seasonality ---
dummy_df = pd.read_csv(CSV_file)
dummy_df['Date'] = pd.to_datetime(dummy_df['Date'])
dummy_df = dummy_df.sort_values('Date')
# Running period counter 1..n acts as the linear-trend regressor.
dummy_df['Time_Period'] = range(1, len(dummy_df) + 1)
dummy_df['Close'] = pd.to_numeric(dummy_df['Close'], errors='coerce')
dummy_df = dummy_df.dropna()

# Month-of-year indicator columns (floats, so OLS sees a numeric matrix).
dummy_df['Month'] = dummy_df['Date'].dt.month
dummy_months = pd.get_dummies(dummy_df['Month'], prefix='Month').astype(float)

# Design matrix: trend + 11 month dummies (last month dropped to avoid the
# dummy-variable trap) + intercept.
X = pd.concat([dummy_df[['Time_Period']].astype(float), dummy_months.iloc[:, :-1]], axis=1)
y = dummy_df['Close'].astype(float)
X = sm.add_constant(X)

# Chronological split: train through 2023, test is calendar 2024.
train_mask = dummy_df['Date'] <= '2023-12-31'
test_mask = (dummy_df['Date'] > '2023-12-31') & (dummy_df['Date'] <= '2024-12-31')
X_train = X[train_mask].values.astype(float)
X_test = X[test_mask].values.astype(float)
y_train = y[train_mask].values.astype(float)
y_test = y[test_mask].values.astype(float)

# Fit OLS; on failure dump dtype/NaN diagnostics before re-raising.
try:
    model = sm.OLS(y_train, X_train).fit()
except Exception as e:
    print("Error in model fitting:", e)
    print("X_train dtype:", X_train.dtype)
    print("y_train dtype:", y_train.dtype)
    print("NaN in X_train:", np.isnan(X_train).sum())
    print("NaN in y_train:", np.isnan(y_train).sum())
    raise

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
all_pred = model.predict(X.values.astype(float))

# MAPE on train / test / whole sample, in percent.
train_mape = mean_absolute_percentage_error(y_train, train_pred) * 100
test_mape = mean_absolute_percentage_error(y_test, test_pred) * 100
whole_mape = mean_absolute_percentage_error(y, all_pred) * 100
# Stash under model-specific names for the combined comparison figure.
dva_train_mape, dva_test_mape, dva_whole_mape = train_mape, test_mape, whole_mape
print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Actual vs fitted, with the train/test split marked.
plt.figure(figsize=(12, 6))
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'][train_mask], train_pred, label='Train Estimated', color='orange')
plt.plot(dummy_df['Date'][test_mask], test_pred, label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Dummy Variable Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Train MAPE: 27.19% Test MAPE: 10.68% Whole Data MAPE: 26.37%
# --- Additive seasonal decomposition used as an in-sample predictor ---
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)

# Additive decomposition with a 12-month seasonal period.
seasonal_decomp_add = seasonal_decompose(df_stl, model='additive', period=12)
df_stl['trend_add'] = seasonal_decomp_add.trend
df_stl['seasonal_add'] = seasonal_decomp_add.seasonal

# Chronological split at the end of 2023.
train_mask = df_stl.index <= '2023-12-31'
test_mask = (df_stl.index > '2023-12-31') & (df_stl.index <= '2024-12-31')

# Reconstruction: trend + seasonal (residual component excluded).
estimated_add = df_stl['trend_add'] + df_stl['seasonal_add']

def _mape(actual, estimate):
    # Mean absolute percentage error in percent (Series.mean skips NaNs
    # from the trend's edge padding).
    return np.mean(np.abs((actual - estimate) / actual)) * 100

train_mape = _mape(df_stl['Close'][train_mask], estimated_add[train_mask])
test_mape = _mape(df_stl['Close'][test_mask], estimated_add[test_mask])
whole_mape = _mape(df_stl['Close'], estimated_add)
# Stash under model-specific names for the combined comparison figure.
add_train_mape, add_test_mape, add_whole_mape = train_mape, test_mape, whole_mape
print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Actual vs reconstruction, with the split marked.
plt.figure(figsize=(12, 6))
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_add[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_add[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Additive STL: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.legend()
plt.tight_layout()
plt.show()
Train MAPE: 5.88% Test MAPE: 8.64% Whole Data MAPE: 5.95%
# --- Multiplicative seasonal decomposition (same pipeline as additive) ---
df_stl = pd.read_csv(CSV_file, usecols=['Date', 'Close'])
df_stl['Date'] = pd.to_datetime(df_stl['Date'])
df_stl.set_index('Date', inplace=True)

# Multiplicative decomposition with a 12-month seasonal period.
seasonal_decomp_mul = seasonal_decompose(df_stl, model='multiplicative', period=12)
df_stl['trend_mul'] = seasonal_decomp_mul.trend
df_stl['seasonal_mul'] = seasonal_decomp_mul.seasonal

# Chronological split at the end of 2023.
train_mask = df_stl.index <= '2023-12-31'
test_mask = (df_stl.index > '2023-12-31') & (df_stl.index <= '2024-12-31')

# Reconstruction: trend * seasonal (residual component excluded).
estimated_mul = df_stl['trend_mul'] * df_stl['seasonal_mul']

def _mape(actual, estimate):
    # Mean absolute percentage error in percent (Series.mean skips NaNs
    # from the trend's edge padding).
    return np.mean(np.abs((actual - estimate) / actual)) * 100

train_mape = _mape(df_stl['Close'][train_mask], estimated_mul[train_mask])
test_mape = _mape(df_stl['Close'][test_mask], estimated_mul[test_mask])
whole_mape = _mape(df_stl['Close'], estimated_mul)
# Stash under model-specific names for the combined comparison figure.
multi_train_mape, multi_test_mape, multi_whole_mape = train_mape, test_mape, whole_mape
print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Whole Data MAPE: {whole_mape:.2f}%")

# Actual vs reconstruction, with the split marked.
plt.figure(figsize=(12, 6))
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_mul[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_mul[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Multiplicative STL: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {whole_mape:.2f}%'
)
plt.legend()
plt.tight_layout()
plt.show()
Train MAPE: 5.36% Test MAPE: 8.74% Whole Data MAPE: 5.45%
#rolling forecasting by ARIMA
# --- Rolling one-step-ahead forecast over the 2024 test window ---
#Split the data
train_end = datetime(2023, 12, 31)
test_end = datetime(2024, 12, 31)
# Close-price split: train through 2023, test is calendar 2024.
train_data = raw_df['Close'][:train_end]
test_data = raw_df['Close'][train_end + timedelta(days=1):test_end]
print(f'Train data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')
# Reuse the (p, d, q) order found by auto_arima earlier.
my_order = pmd_mdl.order
# Rolling forecast origin: refit on all data strictly before each test
# date, forecast one step ahead.
# BUG FIX: the original loop reused the names `train_end` and `train_data`
# as loop state, silently clobbering the split variables defined above.
rolling_predictions = test_data.copy()
for forecast_date in test_data.index:
    history = raw_df['Close'][:forecast_date - timedelta(days=1)]
    model = SARIMAX(endog=history, order=my_order)
    model_fit = model.fit()
    # BUG FIX: forecast() returns a length-1 Series; store the scalar
    # instead of assigning the whole Series into a scalar slot.
    rolling_predictions[forecast_date] = model_fit.forecast().iloc[0]
# Summary of the final (most data) refit only.
print(model_fit.summary())
rolling_residuals = test_data - rolling_predictions
# Residual plot for the test window.
plt.figure(figsize=(10, 4))
plt.plot(rolling_residuals)
plt.axhline(0, linestyle='--', color='k')
plt.title('Rolling Forecast Residuals')
plt.ylabel('Error')
# Actual vs predicted for the test window.
plt.figure(figsize=(10, 4))
plt.plot(test_data, label='Actual')
plt.plot(rolling_predictions, label='Predicted')
plt.title('Rolling Forecast')
plt.ylabel('Close Price')
plt.legend()
print('Mean Absolute Percentage Error:', round(np.mean(abs(rolling_residuals/test_data))*100, 4))
print('Root Mean Squared Error:', np.sqrt(np.mean(rolling_residuals**2)))
Train data shape: (228,)
Test data shape: (12,)
SARIMAX Results
==============================================================================
Dep. Variable: Close No. Observations: 239
Model: SARIMAX(2, 1, 2) Log Likelihood -625.069
Date: Wed, 26 Mar 2025 AIC 1260.137
Time: 23:56:49 BIC 1277.499
Sample: 01-01-2005 HQIC 1267.134
- 11-01-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.2476 0.045 5.520 0.000 0.160 0.335
ar.L2 -0.9381 0.035 -26.485 0.000 -1.007 -0.869
ma.L1 -0.1789 0.043 -4.114 0.000 -0.264 -0.094
ma.L2 0.9404 0.039 24.065 0.000 0.864 1.017
sigma2 11.1611 0.795 14.043 0.000 9.603 12.719
===================================================================================
Ljung-Box (L1) (Q): 0.61 Jarque-Bera (JB): 51.69
Prob(Q): 0.44 Prob(JB): 0.00
Heteroskedasticity (H): 19.31 Skew: 0.41
Prob(H) (two-sided): 0.00 Kurtosis: 5.13
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Mean Absolute Percentage Error: 5.4819
Root Mean Squared Error: 4.648457454156803
#Split the data
# --- Rolling ARIMA forecast, with rolling in-sample (train) predictions ---
train_end = datetime(2023, 12, 31)
test_end = datetime(2024, 12, 31)
# Close-price split: train through 2023, test is calendar 2024.
train_data = raw_df['Close'][:train_end]
test_data = raw_df['Close'][train_end + timedelta(days=1):test_end]
print(f'Train data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')
# Reuse the (p, d, q) order found by auto_arima earlier.
my_order = pmd_mdl.order

# Rolling one-step-ahead predictions inside the training window, starting
# at observation 50 so every refit has enough history.
train_predictions = train_data.copy()
for train_idx in train_data.index[50:]:
    temp_train = raw_df['Close'][:train_idx - timedelta(days=1)]
    model = SARIMAX(endog=temp_train, order=my_order)
    model_fit = model.fit()
    # forecast() returns a length-1 Series; store the scalar value.
    train_predictions[train_idx] = model_fit.forecast().iloc[0]
# BUG FIX: only the slots from index 50 on hold genuine forecasts; the
# first 50 still contain the copied actuals (zero residual), which the
# original included and thereby deflated the training / full-sample MAPE.
predicted_idx = train_data.index[50:]
train_residuals = (train_data[predicted_idx] - train_predictions[predicted_idx]).dropna()

# Rolling one-step-ahead predictions over the test window.
# BUG FIX: the original reused `train_end`/`train_data` as loop variables,
# clobbering the split variables; the stale `train_data` then corrupted the
# later recomputation of train_residuals and the full-dataset MAPE
# (duplicated 2024 dates in the concatenated actuals).
rolling_predictions = test_data.copy()
for forecast_date in test_data.index:
    history = raw_df['Close'][:forecast_date - timedelta(days=1)]
    model = SARIMAX(endog=history, order=my_order)
    model_fit = model.fit()
    rolling_predictions[forecast_date] = model_fit.forecast().iloc[0]

# Test-set residuals (train_residuals were already computed correctly above;
# the original overwrote them here from the clobbered train_data).
rolling_residuals = test_data - rolling_predictions

# MAPE for the test window, in percent.
test_mape = round(np.mean(abs(rolling_residuals/test_data)) * 100, 2)
print('Test Set Mean Absolute Percentage Error:', test_mape, '%')
# MAPE for the genuinely-forecast part of the training window.
train_mape = round(np.mean(abs(train_residuals/train_data[train_residuals.index])) * 100, 2)
print('Training Set Mean Absolute Percentage Error:', train_mape, '%')
# MAPE over everything that was actually forecast (train from obs 50 on, plus test).
full_predictions = pd.concat([train_predictions[predicted_idx], rolling_predictions])
full_actual = pd.concat([train_data[predicted_idx], test_data])
full_residuals = full_actual - full_predictions
full_mape = round(np.mean(abs(full_residuals/full_actual)) * 100, 2)
print('Full Dataset Mean Absolute Percentage Error:', full_mape, '%')
print('Root Mean Squared Error:', np.sqrt(np.mean(rolling_residuals**2)))
# Stash under model-specific names for the combined comparison figure.
arima_train_mape = train_mape
arima_test_mape = test_mape
arima_whole_mape = full_mape

# Actuals with rolling train/test predictions, split marked.
plt.figure(figsize=(15, 6))
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {full_mape:.2f}%'
)
plt.ylabel('Close Price')
plt.legend()
# Faint year separators.
for year in range(2005, 2025):
    plt.axvline(datetime(year, 1, 1), color='k', linestyle='--', alpha=0.2)
plt.show()

# Test-window residuals.
plt.figure(figsize=(15, 4))
plt.plot(rolling_residuals)
plt.axhline(0, linestyle='--', color='k')
plt.title('Testing Forecast Residuals')
plt.ylabel('Error')
plt.grid(True, alpha=0.3)
plt.show()

# White-noise check on the test residuals: Ljung-Box at lag 1.
white_noise_arima = acorr_ljungbox(rolling_residuals, lags=[1], return_df=True)
white_noise_arima
print("\n=== Ljung-Box test Results ===")
print(f"Ljung-Box test p-value: {white_noise_arima['lb_pvalue'][1]}")
# H0: residuals are independently distributed (white noise).
p_value = white_noise_arima['lb_pvalue'][1]
if p_value > 0.05:
    print("\nInterpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests the residuals are independent (white noise).")
else:
    print("\nInterpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests the residuals are not independent (not white noise).")
# Engle's LM test for ARCH effects in the test residuals.
lm_stat, lm_pval, f_stat, f_pval = het_arch(rolling_residuals)
print("\n=== ARCH LM Test Results ===")
print(f"LM test statistic: {lm_stat:.4f}")
print(f"LM test p-value: {lm_pval}")
# H0: no ARCH effect in the residuals.
if lm_pval > 0.05:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value > 0.05, we fail to reject the null hypothesis.")
    print("This suggests there is no ARCH effect in the residuals.")
else:
    print("\nARCH LM Test Interpretation:")
    print("Since p-value < 0.05, we reject the null hypothesis.")
    print("This suggests there is ARCH effect present in the residuals.")
Train data shape: (228,) Test data shape: (12,) Test Set Mean Absolute Percentage Error: 5.48 % Training Set Mean Absolute Percentage Error: 4.67 % Full Dataset Mean Absolute Percentage Error: 4.85 % Root Mean Squared Error: 4.648457454156803
=== Ljung-Box test Results === Ljung-Box test p-value: 0.21585103820068754 Interpretation: Since p-value > 0.05, we fail to reject the null hypothesis. This suggests the residuals are independent (white noise). === ARCH LM Test Results === LM test statistic: 5.5706 LM test p-value: 0.06171122350105976 ARCH LM Test Interpretation: Since p-value > 0.05, we fail to reject the null hypothesis. This suggests there is no ARCH effect in the residuals.
# Plotting
# Re-plot the ARIMA rolling-forecast figure on its own; relies on
# train_predictions / rolling_predictions and the MAPE variables still
# being in scope from the previous section.
plt.figure(figsize=(15, 6))
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')
# Add vertical line to separate train and test
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {train_mape:.2f}%, '
    f'Test MAPE: {test_mape:.2f}%, '
    f'Whole Data MAPE: {full_mape:.2f}%'
)
plt.ylabel('Close Price')
plt.legend()
# Faint year separators.
for year in range(2005, 2025):
    plt.axvline(datetime(year,1,1), color='k', linestyle='--', alpha=0.2)
plt.show()
#plot the split data (4 graphs)
# Combined 2x2 comparison of all four models.
# NOTE(review): train_mask / test_mask at this point are the boolean arrays
# built against df_stl's index in the multiplicative-STL section; indexing
# dummy_df['Date'] with them below only works positionally because both
# frames happen to have the same row count -- confirm this is intentional.
plt.figure(figsize=(20, 12))
# 1. Dummy Variable Model (Top Left)
plt.subplot(2, 2, 1)
plt.plot(dummy_df['Date'], y, label='Actual', color='blue')
plt.plot(dummy_df['Date'][train_mask], train_pred, label='Train Estimated', color='orange')
plt.plot(dummy_df['Date'][test_mask], test_pred, label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Dummy Variable Model: Prediction\n'
    f'Train MAPE: {dva_train_mape:.2f}%, Test MAPE: {dva_test_mape:.2f}%, Whole MAPE:{dva_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
# 2. Additive STL (Top Right)
plt.subplot(2, 2, 2)
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_add[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_add[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Additive STL: Prediction\n'
    f'Train MAPE: {add_train_mape:.2f}%, Test MAPE: {add_test_mape:.2f}%, Whole MAPE:{add_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
# 3. Multiplicative STL (Bottom Left)
plt.subplot(2, 2, 3)
plt.plot(df_stl['Close'], label='Actual', color='blue')
plt.plot(estimated_mul[train_mask], label='Train Estimated', color='orange')
plt.plot(estimated_mul[test_mask], label='Test Estimated', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'Multiplicative STL: Prediction\n'
    f'Train MAPE: {multi_train_mape:.2f}%, Test MAPE: {multi_test_mape:.2f}%, Whole MAPE:{multi_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
# 4. ARIMA Model (Bottom Right)
plt.subplot(2, 2, 4)
plt.plot(raw_df['Close'], label='Actual', color='blue')
plt.plot(train_predictions, label='Training Predictions', color='orange')
plt.plot(rolling_predictions, label='Testing Predictions', color='green')
plt.axvline(x=datetime(2023, 12, 31), color='red', linestyle='--', alpha=0.5)
plt.text(datetime(2023, 12, 31), plt.ylim()[0], 'Train-Test Split', rotation=90, verticalalignment='bottom')
plt.title(
    f'ARIMA Model: Prediction\n'
    f'Train MAPE: {arima_train_mape:.2f}%, Test MAPE: {arima_test_mape:.2f}%, Whole MAPE:{arima_whole_mape:.2f}%'
)
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()